Conclusions at the bottom of the page
# List of required packages
libraries <- c(
"haven", "dplyr", "tidyr", "ggplot2", "reshape2",
"viridis", "knitr", "kableExtra"
)
# Install missing packages
for (pkg in libraries) {
if (!requireNamespace(pkg, quietly = TRUE)) {
install.packages(pkg)
}
}
# Load all packages
invisible(lapply(libraries, library, character.only = TRUE))
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## Loading required package: viridisLite
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
dat <- read_sas("alzheimer25.sas7bdat")
str(dat)
## tibble [1,253 × 38] (S3: tbl_df/tbl/data.frame)
## $ patid : num [1:1253] 10001 10002 10003 10004 10005 ...
## $ trial : num [1:1253] 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : num [1:1253] 0 0 0 1 1 1 0 0 0 1 ...
## $ age : num [1:1253] 72 74 74 75 71 72 75 76 68 73 ...
## $ edu : num [1:1253] 3 3 1 3 2 3 1 4 3 4 ...
## $ bmi : num [1:1253] 26.4 27.1 27.3 24.8 28.2 ...
## $ inkomen: num [1:1253] 1900 1900 1000 2500 2000 2300 1300 2300 1900 3000 ...
## $ job : num [1:1253] 0 0 0 0 0 0 0 0 1 0 ...
## $ adl : num [1:1253] 6 5 6 5 6 6 6 6 14 6 ...
## $ wzc : num [1:1253] 0 0 0 1 1 1 0 0 0 1 ...
## $ cdrsb0 : num [1:1253] 4 1 2 1 19 1 19 19 7 4 ...
## $ cdrsb1 : num [1:1253] 1 3 1 3 11 2 6 12 6 1 ...
## $ cdrsb2 : num [1:1253] 1 3 12 6 12 4 4 15 8 6 ...
## $ cdrsb3 : num [1:1253] 6 8 9 7 10 5 11 8 10 7 ...
## $ cdrsb4 : num [1:1253] 2 NA 8 NA NA 6 6 4 3 10 ...
## $ cdrsb5 : num [1:1253] 16 NA 13 NA NA NA 2 2 19 NA ...
## $ cdrsb6 : num [1:1253] NA NA 19 NA NA NA NA NA 1 NA ...
## $ bprs0 : num [1:1253] 72 77 79 81 72 76 78 80 62 80 ...
## $ bprs1 : num [1:1253] 79 83 85 90 82 85 84 88 72 87 ...
## $ bprs2 : num [1:1253] 86 91 92 95 84 91 92 93 80 93 ...
## $ bprs3 : num [1:1253] 95 98 103 108 92 97 101 103 89 98 ...
## $ bprs4 : num [1:1253] 97 NA 108 NA NA 108 102 109 99 104 ...
## $ bprs5 : num [1:1253] 108 NA 118 NA NA NA 111 113 102 NA ...
## $ bprs6 : num [1:1253] NA NA 123 NA NA NA NA NA 112 NA ...
## $ abpet0 : num [1:1253] 2 2 2 2.06 2 ...
## $ abpet1 : num [1:1253] 2 2.02 2.03 2.94 2 ...
## $ abpet2 : num [1:1253] 2 2.23 2.61 3 2 ...
## $ abpet3 : num [1:1253] 2 2.97 2.98 3 2 ...
## $ abpet4 : num [1:1253] 2 NA 3 NA NA ...
## $ abpet5 : num [1:1253] 2.04 NA 3 NA NA ...
## $ abpet6 : num [1:1253] NA NA 3 NA NA NA NA NA 2 NA ...
## $ taupet0: num [1:1253] 1.9 1.9 1.9 1.9 1.9 ...
## $ taupet1: num [1:1253] 1.9 1.9 1.9 1.9 1.9 ...
## $ taupet2: num [1:1253] 1.9 1.9 1.9 1.9 1.9 ...
## $ taupet3: num [1:1253] 1.9 1.9 1.9 1.9 1.9 ...
## $ taupet4: num [1:1253] 1.9 NA 1.9 NA NA ...
## $ taupet5: num [1:1253] 1.9 NA 2.03 NA NA ...
## $ taupet6: num [1:1253] NA NA 2.79 NA NA ...
## - attr(*, "label")= chr "alzheimer25 dataset written by Stat/Transfer Ver. 15.1.1403.1030"
colSums(is.na(dat))
## patid trial sex age edu bmi inkomen job adl wzc
## 0 0 0 0 0 0 0 0 0 0
## cdrsb0 cdrsb1 cdrsb2 cdrsb3 cdrsb4 cdrsb5 cdrsb6 bprs0 bprs1 bprs2
## 0 147 239 346 476 601 742 0 147 239
## bprs3 bprs4 bprs5 bprs6 abpet0 abpet1 abpet2 abpet3 abpet4 abpet5
## 346 476 601 742 0 147 239 346 476 601
## abpet6 taupet0 taupet1 taupet2 taupet3 taupet4 taupet5 taupet6
## 742 0 147 239 346 476 601 742
reshape_dat <- function(dat) {
dat_long <- dat %>%
pivot_longer(
cols = matches("^(bprs|cdrsb|abpet|taupet)\\d+"),
names_to = c(".value", "time"),
names_pattern = "(.+)(\\d+)"
)
dat_long
}
to_factor <- function(dat_long) {
dat_long <- dat_long %>%
mutate(
time = as.factor(time), # coded 0-6
sex = as.factor(sex),
edu = as.factor(edu),
trial = as.factor(trial),
job = as.factor(job),
wzc = as.factor(wzc),
time_num = as.numeric(time) # coded 1-7
)
dat_long
}
dat_long <- reshape_dat(dat)
dat_long <- to_factor(dat_long)
plot_df <- dat_long %>%
pivot_longer(
cols = c(bprs, cdrsb, abpet, taupet),
names_to = "variable",
values_to = "value"
) %>%
rename(year = time_num)
mean_var_summary <- plot_df %>%
group_by(variable, year) %>%
summarise(
mean_value = mean(value, na.rm = TRUE),
var_value = var(value, na.rm = TRUE),
sd_value = sd(value, na.rm = TRUE),
n = sum(!is.na(value)),
se_value = sd_value / sqrt(n),
.groups = "drop"
)
ggplot(mean_var_summary, aes(x = year, y = mean_value)) +
geom_line(color = "firebrick", linewidth = 1.2) +
geom_point(color = "firebrick", size = 2) +
geom_ribbon(aes(ymin = mean_value - sd_value,
ymax = mean_value + sd_value),
fill = "firebrick", alpha = 0.2) +
facet_wrap(~ variable, scales = "free_y", ncol = 2) +
labs(
title = "Mean ± SD per variable across years",
x = "Year", y = "Mean ± SD"
) +
theme_minimal(base_size = 14) +
theme(strip.text = element_text(face = "bold"))
M <- cor(dat[, c("cdrsb0", "bprs0", "abpet0", "taupet0",
"inkomen", "bmi", "job", "trial", "adl", "age", "wzc")],
use = "pairwise.complete.obs")
melted_M <- reshape2::melt(M)
ord <- hclust(dist(M))$order
melted_M$Var1 <- factor(melted_M$Var1, levels = rownames(M)[ord])
melted_M$Var2 <- factor(melted_M$Var2, levels = colnames(M)[ord])
ggplot(melted_M, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white", linewidth = 0.4) + # thin white grid lines
scale_fill_gradient2(
low = "#4575b4", mid = "white", high = "#d73027", midpoint = 0,
limits = c(-1, 1),
name = "Correlation (-1 : 1)"
) +
coord_fixed() +
labs(
title = "Correlation Heatmap of Continuous Features at Baseline ",
x = NULL, y = NULL
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 11, face = "bold"),
axis.text.y = element_text(size = 11, face = "bold"),
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 13, hjust = 0.5, color = "gray30"),
panel.grid = element_blank(),
legend.position = "right",
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 10)
)
desired_order <- c("bprs0","bprs1","bprs2","bprs3","bprs4","bprs5","bprs6")
M <- cor(dat[, desired_order],
use = "pairwise.complete.obs")
melted_M <- reshape2::melt(M)
melted_M$Var1 <- factor(melted_M$Var1, levels = desired_order)
melted_M$Var2 <- factor(melted_M$Var2, levels = desired_order)
ggplot(melted_M, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white", linewidth = 0.4) + # thin white grid lines
scale_fill_gradient2(
low = "#4575b4", mid = "white", high = "#d73027", midpoint = 0.9,
limits = c(0.8, 1),
name = "Correlation (0.8 : 1)"
) +
coord_fixed() +
labs(
title = "Correlation Heatmap of BPRS scores",
x = NULL, y = NULL
)
ggplot(dat, aes(x = age, fill = as.factor(sex))) +
geom_density(alpha = 0.5, color = "black", linewidth = 0.4) +
scale_fill_manual(
values = c("#4575b4", "#d73027"),
labels = c('Male', 'Female'),
name = 'Sex'
) +
labs(
title = "Age Distribution at baseline by Sex",
x = "Age",
y = "Density"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
legend.position = "top",
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 11),
axis.text = element_text(size = 11),
panel.grid.minor = element_blank()
)
dat <- dat %>%
mutate(
sex = factor(sex, labels = c("Male", "Female")),
wzc = factor(wzc, labels = c("Other", "Nursing Home Resident"))
)
dat_long <- dat_long %>%
mutate(
sex = factor(sex, labels = c("Male", "Female")),
wzc = factor(wzc, labels = c("Other", "Nursing Home Resident"))
)
# Individual points
ggplot(dat_long, aes(x = time, y = bprs, color = sex)) +
geom_jitter(alpha = 0.6) +
scale_color_manual(values = c("#4575b4", "#d73027")) +
theme_minimal() +
labs(
title = 'Individual BPRS Score by Time and Sex',
x = 'Year',
y = 'BPRS Score'
)
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Individual trajectories
ggplot(dat_long, aes(x = time, y = bprs, color = sex, group = patid)) +
geom_line(alpha = 0.4) +
geom_point(alpha = 0.6) +
scale_color_manual(values = c("#4575b4", "#d73027")) +
theme_minimal() +
labs(
title = 'Individual BPRS Score Profiles by Time and Sex',
x = 'Year',
y = 'BPRS Score'
)
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Average profile ± SE by sex
ggplot(dat_long, aes(x = time, y = bprs, color = sex, group = sex)) +
stat_summary(fun = mean, geom = "line", size = 1) +
stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
scale_color_manual(values = c("#4575b4", "#d73027")) +
theme_minimal() +
labs(
title = 'Average BPRS Score by Time and Sex',
x = 'Year',
y = 'BPRS Score'
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 2551 rows containing non-finite outside the scale range
## (`stat_summary()`).
## Removed 2551 rows containing non-finite outside the scale range
## (`stat_summary()`).
# Boxplot of age at baseline
ggplot(dat, aes(x = sex, y = age, fill = sex)) +
geom_boxplot(alpha = 0.6, color = "black") +
scale_fill_manual(values = c("#4575b4", "#d73027")) +
theme_minimal(base_size = 14) +
labs(title = "Age at baseline by Sex", x = "Sex", y = "Age") +
theme(legend.position = "none")
alive_at_year6 <- dat %>%
filter(!is.na(bprs6)) %>%
mutate(age_year6 = age + 6)
# Boxplot of age at year 6
ggplot(alive_at_year6, aes(x = sex, y = age_year6, fill = sex)) +
geom_boxplot(alpha = 0.6, color = "black") +
scale_fill_manual(values = c("#4575b4", "#d73027")) +
theme_minimal(base_size = 14) +
labs(
title = "Age at Year 6 by Sex (Only Participants with BPRS6 Available)",
x = "Sex",
y = "Age at Year 6"
) +
theme(legend.position = "none")
# Dropouts by sex
dropouts_sex <- dat %>%
group_by(sex) %>%
summarise(across(starts_with("bprs"), ~ sum(!is.na(.x)))) %>%
ungroup()
dropouts_sex %>%
kable(caption = "Number of Patients by Sex at Each Measurement Year") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| sex | bprs0 | bprs1 | bprs2 | bprs3 | bprs4 | bprs5 | bprs6 |
|---|---|---|---|---|---|---|---|
| Male | 616 | 583 | 551 | 506 | 463 | 409 | 341 |
| Female | 637 | 523 | 463 | 401 | 314 | 243 | 170 |
# Individuals points
ggplot(dat_long, aes(x = time, y = bprs, color = age)) +
geom_jitter(alpha = 0.6) +
scale_color_viridis_c(name = "Age") +
theme_minimal() +
labs(
title = 'Individual BPRS Scores by Time and Age',
x = 'Year',
y = 'BPRS Score'
)
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).
set.seed(123)
sample_ids <- dat_long %>%
distinct(patid) %>%
sample_n(150) %>%
pull(patid)
dat_long_sample <- dat_long %>%
filter(patid %in% sample_ids)
# Individual trajectories
ggplot(dat_long_sample, aes(x = time, y = bprs, group = patid, color = age)) +
geom_line(alpha = 0.5, linewidth = 0.8) +
scale_color_viridis_c(name = "Age") +
theme_minimal() +
labs(
title = "Individual BPRS Profiles by Time and Age (Sample of 150 Patients)",
x = "Year",
y = "BPRS Score"
)
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Dropouts by age
dropouts_age <- dat %>%
group_by(age) %>%
summarise(across(starts_with("bprs"), ~ sum(!is.na(.x)))) %>%
ungroup()
dropouts_age %>%
kable(caption = "Number of Patients by Age at Each Measurement Year") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| age | bprs0 | bprs1 | bprs2 | bprs3 | bprs4 | bprs5 | bprs6 |
|---|---|---|---|---|---|---|---|
| 46 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 51 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 52 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| 53 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| 54 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| 55 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| 56 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
| 57 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 58 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 59 | 15 | 15 | 15 | 15 | 15 | 15 | 15 |
| 60 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 61 | 17 | 17 | 17 | 17 | 17 | 17 | 17 |
| 62 | 31 | 31 | 31 | 31 | 31 | 31 | 31 |
| 63 | 27 | 27 | 27 | 27 | 27 | 27 | 26 |
| 64 | 37 | 37 | 37 | 37 | 37 | 37 | 35 |
| 65 | 35 | 35 | 35 | 34 | 34 | 31 | 27 |
| 66 | 49 | 49 | 49 | 49 | 46 | 45 | 39 |
| 67 | 60 | 60 | 60 | 59 | 58 | 53 | 42 |
| 68 | 58 | 57 | 57 | 54 | 50 | 44 | 37 |
| 69 | 56 | 56 | 56 | 53 | 49 | 41 | 33 |
| 70 | 61 | 61 | 59 | 55 | 53 | 49 | 40 |
| 71 | 68 | 68 | 68 | 61 | 50 | 38 | 26 |
| 72 | 84 | 82 | 77 | 71 | 58 | 42 | 27 |
| 73 | 64 | 60 | 55 | 44 | 38 | 30 | 20 |
| 74 | 51 | 50 | 48 | 40 | 28 | 24 | 13 |
| 75 | 66 | 63 | 51 | 44 | 36 | 23 | 9 |
| 76 | 61 | 52 | 50 | 45 | 33 | 23 | 10 |
| 77 | 65 | 57 | 50 | 43 | 29 | 18 | 8 |
| 78 | 34 | 29 | 25 | 16 | 10 | 7 | 3 |
| 79 | 45 | 37 | 32 | 26 | 15 | 4 | 2 |
| 80 | 44 | 30 | 19 | 15 | 10 | 3 | 1 |
| 81 | 34 | 27 | 16 | 7 | 0 | 0 | 0 |
| 82 | 33 | 16 | 11 | 9 | 2 | 0 | 0 |
| 83 | 23 | 15 | 10 | 5 | 2 | 1 | 1 |
| 84 | 24 | 10 | 4 | 1 | 0 | 0 | 0 |
| 85 | 19 | 9 | 4 | 0 | 0 | 0 | 0 |
| 86 | 12 | 5 | 4 | 2 | 2 | 2 | 2 |
| 87 | 14 | 3 | 0 | 0 | 0 | 0 | 0 |
| 88 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
| 89 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 90 | 3 | 1 | 0 | 0 | 0 | 0 | 0 |
| 91 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 92 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 93 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 94 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
# Individual points
ggplot(dat_long, aes(x = time, y = bprs, color = wzc)) +
geom_jitter(alpha = 0.6) +
scale_color_manual(values = c("#4575b4", "#d73027")) +
theme_minimal() +
labs(
title = 'Individual BPRS Scores by Time and WZC',
x = 'Year',
y = 'BPRS Score'
)
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).
set.seed(123)
sample_ids <- dat_long %>%
distinct(patid) %>%
sample_n(150) %>%
pull(patid)
dat_long_sample <- dat_long %>%
filter(patid %in% sample_ids)
# Individual trajectories
ggplot(dat_long, aes(x = time, y = bprs, color = wzc, group = patid)) +
geom_line(alpha = 0.4) +
geom_point(alpha = 0.6) +
scale_color_manual(values = c("#4575b4", "#d73027")) +
theme_minimal() +
labs(
title = 'Individual BPRS Profiles by Time by Sex',
x = 'Year',
y = 'BPRS Score'
)
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Average profile ± SE by WZC
ggplot(dat_long, aes(x = time, y = bprs, color = wzc, group = wzc)) +
stat_summary(fun = mean, geom = "line", size = 1) +
stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
scale_color_manual(values = c("#4575b4", "#d73027")) +
theme_minimal() +
labs(
title = 'Average BPRS Score by Time and WZC',
x = 'Year',
y = 'BPRS Score'
)
## Warning: Removed 2551 rows containing non-finite outside the scale range
## (`stat_summary()`).
## Warning: Removed 2551 rows containing non-finite outside the scale range
## (`stat_summary()`).
ggplot(dat, aes(x = wzc, y = age, fill = wzc)) +
geom_boxplot(alpha = 0.6, color = "black") +
scale_fill_manual(values = c("#4575b4", "#d73027")) +
theme_minimal(base_size = 14) +
labs(title = "Age at baseline by WZC", x = "WZC", y = "Age") +
theme(legend.position = "none")
dropouts_wzc <- dat %>%
group_by(wzc) %>%
summarise(across(starts_with("bprs"), ~ sum(!is.na(.x)))) %>%
ungroup()
dropouts_wzc %>%
kable(caption = "Number of Patients by WZC at Each Measurement Year") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| wzc | bprs0 | bprs1 | bprs2 | bprs3 | bprs4 | bprs5 | bprs6 |
|---|---|---|---|---|---|---|---|
| Other | 766 | 739 | 721 | 686 | 626 | 559 | 465 |
| Nursing Home Resident | 487 | 367 | 293 | 221 | 151 | 93 | 46 |
# Individuals points
ggplot(dat_long, aes(x = time, y = bprs, color = as.factor(trial))) +
geom_jitter(alpha = 0.6) +
scale_fill_manual(
values = grDevices::rainbow(length(unique(dat$trial))),
name = "Trial"
) +
theme_minimal() +
labs(
title = 'Individual BPRS Scores as by Time and Trial',
x = 'Year',
y = 'BPRS Score'
)
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Individual trajectories
ggplot(dat_long_sample, aes(x = time, y = bprs, group = patid, color = as.factor(trial))) +
geom_line(alpha = 0.5, linewidth = 0.8) +
scale_fill_manual(
values = grDevices::rainbow(length(unique(dat$trial))),
name = "Trial"
) +
theme_minimal() +
labs(
title = "Individual BPRS Profiles by Time and Trial (Sample of 150 Patients)",
x = "Year",
y = "BPRS Score"
)
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).
ggplot(dat, aes(x = as.factor(trial), y = age, fill = as.factor(trial))) +
geom_boxplot(alpha = 0.7, color = "black") +
scale_fill_manual(
values = grDevices::rainbow(length(unique(dat$trial))),
name = "Trial"
) +
theme_minimal(base_size = 14) +
labs(title = "Age at baseline by Trial", x = "Trial", y = "Age") +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
)
ggplot(dat_long, aes(x = time, y = bprs, color = as.factor(job))) +
geom_jitter(alpha = 0.6) +
scale_color_manual(
values = grDevices::rainbow(length(unique(dat$job))),
name = "Job"
) +
theme_minimal() +
labs(
title = 'Individual BPRS Scores by Time and Job',
x = 'Year',
y = 'BPRS Score'
)
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(dat_long_sample, aes(x = time, y = bprs, group = patid, color = as.factor(job))) +
geom_line(alpha = 0.5, linewidth = 0.8) +
scale_color_manual(
values = grDevices::rainbow(length(unique(dat$job))),
name = "Job"
) +
theme_minimal() +
labs(
title = "Individual BPRS Profiles by Time and Job (Sample of 150 Patients)",
x = "Year",
y = "BPRS Score"
)
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).
ggplot(dat, aes(x = as.factor(job), y = age, fill = as.factor(job))) +
geom_boxplot(alpha = 0.7, color = "black") +
scale_fill_manual(
values = grDevices::rainbow(length(unique(dat$job))),
name = "Job"
) +
theme_minimal(base_size = 14) +
labs(
title = "Age at Baseline by Job",
x = "Job",
y = "Age"
) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
)
ggplot(dat_long, aes(x = time, y = bprs, color = as.factor(adl))) +
geom_jitter(alpha = 0.6) +
scale_color_manual(
values = grDevices::rainbow(length(unique(dat$adl))),
name = "ADL"
) +
theme_minimal() +
labs(
title = 'Individual BPRS Scores by Time and ADL',
x = 'Year',
y = 'BPRS Score'
)
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(dat_long_sample, aes(x = time, y = bprs, group = patid, color = as.factor(adl))) +
geom_line(alpha = 0.5, linewidth = 0.8) +
scale_color_manual(
values = grDevices::rainbow(length(unique(dat$adl))),
name = "ADL"
) +
theme_minimal() +
labs(
title = "Individual BPRS Profiles by Time and ADL (Sample of 150 Patients)",
x = "Year",
y = "BPRS Score"
)
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).
ggplot(dat, aes(x = as.factor(adl), y = age, fill = as.factor(adl))) +
geom_boxplot(alpha = 0.7, color = "black", outlier.size = 0.8) +
scale_fill_manual(
values = grDevices::rainbow(length(unique(dat$adl))),
name = "ADL"
) +
theme_minimal(base_size = 14) +
labs(
title = "Age at Baseline by ADL",
x = "ADL Category",
y = "Age"
) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
)
## EDA by BMI
ggplot(dat_long, aes(x = time, y = bprs, color = bmi)) +
geom_jitter(alpha = 0.6) +
scale_color_viridis_c(name = "BMI") +
theme_minimal() +
labs(
title = 'Individual BPRS Scores by Time and BMI',
x = 'Year',
y = 'BPRS Score'
)
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(dat_long_sample, aes(x = time, y = bprs, group = patid, color = bmi)) +
geom_line(alpha = 0.5, linewidth = 0.8) +
scale_color_viridis_c(name = "BMI") +
theme_minimal() +
labs(
title = "Individual BPRS Profiles by Time and BMI (Sample of 150 Patients)",
x = "Year",
y = "BPRS Score"
)
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).
dat <- dat %>%
mutate(age_bin = cut(
age,
breaks = c(-Inf, 60,70, 80, Inf),
labels = c("<60", "60-70", "70-80", "80+"),
right = FALSE
))
ggplot(dat, aes(x = age_bin, y = bmi, fill = age_bin)) +
geom_boxplot(alpha = 0.7, color = "black") +
scale_fill_manual(values = c("#b3cde0", "#005b96", "#03396c","#011f4b")) +
theme_minimal(base_size = 14) +
labs(
title = "BMI at Baseline by Age Group",
x = "Age Group",
y = "BMI"
) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
)
non_na_counts <- dat %>%
summarise(across(starts_with("bprs"), ~ sum(!is.na(.x))))
plot_data <- non_na_counts %>%
pivot_longer(
cols = everything(),
names_to = "measurement_occasion",
values_to = "count"
) %>%
mutate(year = as.numeric(gsub("bprs", "", measurement_occasion)))
ggplot(plot_data, aes(x = year, y = count)) +
geom_col(fill = "steelblue", color = "black") +
geom_text(aes(label = count), vjust = -0.5) +
theme_minimal() +
labs(
title = "Number of Available Observations at Each Time Point",
x = "Year",
y = "Number of available observations"
)
## Correlation between dropout rate and age and bprs at baseline
dat$NAs_1 <- as.numeric(is.na(dat$bprs1))
dat$NAs_2 <- as.numeric(is.na(dat$bprs2))
dat$NAs_3 <- as.numeric(is.na(dat$bprs3))
dat$NAs_4 <- as.numeric(is.na(dat$bprs4))
dat$NAs_5 <- as.numeric(is.na(dat$bprs5))
dat$NAs_6 <- as.numeric(is.na(dat$bprs6))
corr_data <- dat %>%
select(age, bprs0, NAs_1:NAs_6)
cor_matrix <- cor(corr_data, use="pairwise.complete")
melted <- melt(cor_matrix)
ggplot(melted, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white", linewidth = 0.4) + # thin white grid lines
scale_fill_gradient2(
low = "#4575b4", mid = "white", high = "#d73027", midpoint = 0,
limits = c(-1, 1),
name = "Correlation (-1 : 1)"
) +
coord_fixed() +
labs(
title = "Correlation Heatmap ",
subtitle = "Number of Dropouts with Age and BPRS at Baseline",
x = NULL, y = NULL
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 11, face = "bold"),
axis.text.y = element_text(size = 11, face = "bold"),
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 13, hjust = 0.5, color = "gray30"),
panel.grid = element_blank(),
legend.position = "right",
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 10)
)
# Paired t-tests baseline (0) vs final (6)
t.test(dat$cdrsb0, dat$cdrsb6, paired = TRUE)
##
## Paired t-test
##
## data: dat$cdrsb0 and dat$cdrsb6
## t = -8.73, df = 510, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -6.681396 -4.226628
## sample estimates:
## mean difference
## -5.454012
t.test(dat$bprs0, dat$bprs6, paired = TRUE)
##
## Paired t-test
##
## data: dat$bprs0 and dat$bprs6
## t = -173.5, df = 510, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -42.78828 -41.83012
## sample estimates:
## mean difference
## -42.3092
t.test(dat$abpet0, dat$abpet6, paired = TRUE)
##
## Paired t-test
##
## data: dat$abpet0 and dat$abpet6
## t = -11.407, df = 510, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.2096247 -0.1480270
## sample estimates:
## mean difference
## -0.1788258
t.test(dat$taupet0,dat$taupet6,paired = TRUE)
##
## Paired t-test
##
## data: dat$taupet0 and dat$taupet6
## t = -9.1044, df = 510, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.13856669 -0.08937852
## sample estimates:
## mean difference
## -0.1139726
# We fit a standard linear model (OLS) ignoring the patient-specific correlations.
preliminary_ols_model <- lm(bprs ~ time_num + sex + age + wzc + adl,
data = dat_long, na.action = na.exclude)
# The residuals represent the part of the BPRS score that is NOT explained by the average trend.
# In theory, this is what's left over for the random effects and residual error to explain.
dat_long$ols_residuals = residuals(preliminary_ols_model)
# If the random effects were just a random intercept (b_0i), these profiles should be flat but
# shifted up or down for each patient, with random noise.
# If there is a random slope (b_1i), we would expect to see individual slopes in these residual profiles.
ggplot(dat_long, aes(x = time_num, y = ols_residuals, group = patid)) +
geom_line(alpha = 0.3) +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
labs(title = "OLS Residual Profiles for Each Patient",
x = "Year (Numeric)",
y = "OLS Residuals") +
theme_minimal()
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_line()`).
Summarize your conclusions. What are the implications with respect to statistical modeling ?
From our exploratory data analysis, we observe that each patient’s clinical and neuroimaging features were measured up to six times, corresponding to annual follow-ups over six years. The variable of interest, BPRS, which measures the severity of psychiatric symptoms, shows a linear trend over time. The variance appears to remain more or less constant across the years.
Among the four measures, BPRS shows by far the greatest change across time. This indicates that behavioral and psychiatric deterioration dominates the longitudinal trajectory, whereas cognitive decline (CDRSB) is more moderate, and biological markers (ABPET, TAUPET) show much smaller absolute shifts. All variables show statistically significant changes, but their effect sizes differ dramatically, emphasizing that statistical significance does not necessarily imply clinical importance*
*(Matteo: I am not sure anymore about this sentence, because we have no idea how clinically relevant is a big change in one feature. As in, maybe a smaller absolute change in ABPET has much stronger clinical effects than a bigger absolute change in BPRS).
The correlation matrix indicates that BPRS at baseline is positively correlated with age, WZC, and other clinical features, while it is negatively correlated with job, ADL, trial, and income. However, further exploration suggests that age is likely the main driver of variation in BPRS.
A spurious association is observed between sex and BPRS: women appear to have higher BPRS scores than men, but this can be explained by their higher mean age at baseline. Similar spurious relationships are found for WZC, job, ADL, and trial, all of which are confounded by age.
The longitudinal plots confirm a positive, linear association between age and BPRS. There is also considerable attrition over time, with the number of observations decreasing from 1,253 at baseline to 511 at year six. Approximately 63% of the dropouts are women, likely due to their higher baseline age, which may have led to faster progression of psychiatric symptoms or inability to continue participation. It is plausible that participants with high baseline BPRS scores either died or became too impaired to remain in the study.
This pattern of dropout might explain why the overall variance of BPRS appears constant over time. Although sample size decreases, the reduction in the range of BPRS values among remaining participants likely counterbalances the expected increase in variance.
Based on these findings, a linear mixed-effects model is the most appropriate statistical approach for analyzing this longitudinal dataset. Time and age should be included as fixed effects, with patient ID modeled as a random effect to account for repeated measures within individuals. Since patients are also nested within trials, trial can be included as an additional random effect. The residuals extracted from the OLS model follow this formula: \(r_i = y_i - X_i\hat{\beta}_{OLS} \approx Z_ib_i + \epsilon_i\), which implies that after taking into account the fixed effects, any variance left is the sum of subject-specific effects plus any residual variance. From the spaghetti plot of the residuals we can confirm that different subjects need different intercepts. This plot also reveals the need of subject-specific slopes, as some subjects’ bprs score increase faster or slower than the average over time. The fact that the residuals cluster around 0 shows that the distributional assumption \(E \left( \begin{pmatrix} b_{0i} \\ b_{1i} \end{pmatrix} \right) = \begin{pmatrix} 0 \\ 0 \end{pmatrix}\) is met.
Finally, given that the data are collected at regular time intervals, it is reasonable to assume an autoregressive correlation structure for the within-subject residuals, capturing the expected correlation between adjacent time points (plot a semivariogram once the linear mixed model is fitted).